Load Library
library(tidyverse)
library(tidyr)
library(tidymodels)
library(tidytext)
library(janitor)
library(skimr)
library(vip)
library(parallel)
library(doParallel)
library(rpart.plot)
library(textrecipes)
library(stringi)
library(xgboost)
library(DALEXtra)
library(DALEX)
library(MASS)
library(solitude)
library(rpart.plot)
library(ggpubr)
Import Data
loan <- read_csv("loan_train.csv", na = c("NA", "", "-")) %>%
mutate(int_rate = as.numeric(str_replace(int_rate, "%", "")),
revol_util = as.numeric(str_replace(revol_util, "%", "")),
empl_year = word(emp_length)) %>% clean_names()
## Rows: 29777 Columns: 52
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (29): id, member_id, loan_amnt, funded_amnt, funded_amnt_inv, installmen...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skim(loan)
Data summary
| Name |
loan |
| Number of rows |
29777 |
| Number of columns |
53 |
| _______________________ |
|
| Column type frequency: |
|
| character |
22 |
| numeric |
31 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| term |
3 |
1.00 |
9 |
9 |
0 |
2 |
0 |
| grade |
3 |
1.00 |
1 |
1 |
0 |
7 |
0 |
| sub_grade |
3 |
1.00 |
2 |
2 |
0 |
35 |
0 |
| emp_title |
1817 |
0.94 |
2 |
75 |
0 |
22143 |
0 |
| emp_length |
3 |
1.00 |
3 |
9 |
0 |
12 |
0 |
| home_ownership |
3 |
1.00 |
3 |
8 |
0 |
5 |
0 |
| verification_status |
3 |
1.00 |
8 |
15 |
0 |
3 |
0 |
| issue_d |
3 |
1.00 |
8 |
8 |
0 |
55 |
0 |
| loan_status |
0 |
1.00 |
7 |
7 |
0 |
2 |
0 |
| pymnt_plan |
3 |
1.00 |
1 |
1 |
0 |
2 |
0 |
| url |
3 |
1.00 |
62 |
64 |
0 |
29774 |
0 |
| desc |
9432 |
0.68 |
1 |
3943 |
0 |
20310 |
0 |
| purpose |
3 |
1.00 |
3 |
18 |
0 |
14 |
0 |
| title |
13 |
1.00 |
2 |
80 |
0 |
15200 |
0 |
| zip_code |
3 |
1.00 |
5 |
5 |
0 |
819 |
0 |
| addr_state |
3 |
1.00 |
2 |
2 |
0 |
50 |
0 |
| earliest_cr_line |
23 |
1.00 |
8 |
8 |
0 |
516 |
0 |
| last_pymnt_d |
67 |
1.00 |
8 |
8 |
0 |
106 |
0 |
| next_pymnt_d |
27425 |
0.08 |
8 |
8 |
0 |
96 |
0 |
| last_credit_pull_d |
5 |
1.00 |
8 |
8 |
0 |
109 |
0 |
| application_type |
3 |
1.00 |
10 |
10 |
0 |
1 |
0 |
| empl_year |
3 |
1.00 |
1 |
3 |
0 |
12 |
0 |
Variable type: numeric
| id |
3 |
1.00 |
663006.18 |
219881.52 |
54734.00 |
496344.25 |
642763.00 |
825647.50 |
1077501.00 |
▁▃▇▆▅ |
| member_id |
3 |
1.00 |
823568.15 |
280420.33 |
70473.00 |
635380.00 |
822247.50 |
1033656.25 |
1314167.00 |
▁▃▇▇▆ |
| loan_amnt |
3 |
1.00 |
11109.43 |
7404.65 |
500.00 |
5225.00 |
9800.00 |
15000.00 |
35000.00 |
▇▇▃▂▁ |
| funded_amnt |
3 |
1.00 |
10843.64 |
7147.05 |
500.00 |
5100.00 |
9600.00 |
15000.00 |
35000.00 |
▇▇▃▂▁ |
| funded_amnt_inv |
3 |
1.00 |
10149.66 |
7130.86 |
0.00 |
4950.00 |
8500.00 |
14000.00 |
35000.00 |
▇▆▃▁▁ |
| int_rate |
3 |
1.00 |
12.17 |
3.72 |
5.42 |
9.63 |
11.99 |
14.72 |
24.11 |
▆▇▇▂▁ |
| installment |
3 |
1.00 |
323.81 |
209.77 |
15.67 |
165.84 |
278.94 |
429.86 |
1305.19 |
▇▆▂▁▁ |
| annual_inc |
4 |
1.00 |
69201.23 |
66566.42 |
2000.00 |
40000.00 |
59000.00 |
82500.00 |
6000000.00 |
▇▁▁▁▁ |
| dti |
3 |
1.00 |
13.38 |
6.74 |
0.00 |
8.19 |
13.49 |
18.70 |
29.99 |
▅▇▇▆▂ |
| delinq_2yrs |
23 |
1.00 |
0.16 |
0.52 |
0.00 |
0.00 |
0.00 |
0.00 |
13.00 |
▇▁▁▁▁ |
| fico_range_low |
3 |
1.00 |
713.05 |
36.31 |
610.00 |
685.00 |
710.00 |
740.00 |
825.00 |
▁▇▇▅▁ |
| fico_range_high |
3 |
1.00 |
717.05 |
36.31 |
614.00 |
689.00 |
714.00 |
744.00 |
829.00 |
▁▇▇▅▁ |
| inq_last_6mths |
23 |
1.00 |
1.08 |
1.54 |
0.00 |
0.00 |
1.00 |
2.00 |
33.00 |
▇▁▁▁▁ |
| mths_since_last_delinq |
18907 |
0.37 |
34.72 |
22.32 |
0.00 |
17.00 |
32.00 |
51.00 |
120.00 |
▇▇▅▂▁ |
| mths_since_last_record |
27208 |
0.09 |
59.23 |
47.22 |
0.00 |
0.00 |
85.00 |
102.00 |
129.00 |
▇▁▂▆▅ |
| open_acc |
23 |
1.00 |
9.34 |
4.51 |
1.00 |
6.00 |
9.00 |
12.00 |
47.00 |
▇▃▁▁▁ |
| pub_rec |
23 |
1.00 |
0.06 |
0.25 |
0.00 |
0.00 |
0.00 |
0.00 |
5.00 |
▇▁▁▁▁ |
| revol_bal |
3 |
1.00 |
14310.00 |
22696.55 |
0.00 |
3644.00 |
8783.50 |
17187.00 |
1207359.00 |
▇▁▁▁▁ |
| revol_util |
67 |
1.00 |
49.09 |
28.33 |
0.00 |
25.83 |
49.50 |
72.60 |
119.00 |
▇▇▇▇▁ |
| total_acc |
23 |
1.00 |
22.08 |
11.59 |
1.00 |
13.00 |
20.00 |
29.00 |
81.00 |
▇▇▂▁▁ |
| out_prncp |
3 |
1.00 |
11.80 |
123.53 |
0.00 |
0.00 |
0.00 |
0.00 |
3126.61 |
▇▁▁▁▁ |
| out_prncp_inv |
3 |
1.00 |
11.76 |
123.23 |
0.00 |
0.00 |
0.00 |
0.00 |
3123.44 |
▇▁▁▁▁ |
| total_rec_late_fee |
3 |
1.00 |
1.50 |
7.72 |
0.00 |
0.00 |
0.00 |
0.00 |
180.20 |
▇▁▁▁▁ |
| last_pymnt_amnt |
3 |
1.00 |
2615.41 |
4373.70 |
0.00 |
211.02 |
531.98 |
3171.29 |
36115.20 |
▇▁▁▁▁ |
| collections_12_mths_ex_med |
104 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
| policy_code |
3 |
1.00 |
1.00 |
0.00 |
1.00 |
1.00 |
1.00 |
1.00 |
1.00 |
▁▁▇▁▁ |
| acc_now_delinq |
23 |
1.00 |
0.00 |
0.01 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| chargeoff_within_12_mths |
104 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
| delinq_amnt |
23 |
1.00 |
0.20 |
35.09 |
0.00 |
0.00 |
0.00 |
0.00 |
6053.00 |
▇▁▁▁▁ |
| pub_rec_bankruptcies |
966 |
0.97 |
0.05 |
0.21 |
0.00 |
0.00 |
0.00 |
0.00 |
2.00 |
▇▁▁▁▁ |
| tax_liens |
79 |
1.00 |
0.00 |
0.01 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
Exploratory Analysis - Categorical
loan <- loan %>%
mutate_if(is.character, as.factor) %>%
mutate(loan_status = factor(loan_status))
loan %>%
count(loan_status) %>%
mutate(pct = n/sum(n)) -> default_rate
default_rate %>%
ggplot(aes(x = loan_status, y = pct)) +
geom_col() +
geom_text(aes(label = round(pct, 4)), color = "red") +
labs(title = "Loan Default Rate")

# -- explore factors
char_explore <- function(col){loan %>%
na.omit() %>%
ggplot(., aes(!!as.name(col))) +
geom_bar(aes(fill=loan_status), position="fill") +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
labs(title = col)
}
for (column in names(loan %>% select_if(is.factor))){
chrt <- char_explore(column)
print(chrt)
}






















# -- term
loan %>%
group_by(term, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = term, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Term",
x = "Term",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- grade
loan %>%
group_by(grade, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = grade, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Grade",
x = "Grade",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- sub grade
loan %>%
group_by(sub_grade, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = sub_grade, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Sub Grade",
x = "Sub Grade",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- emp_length
loan %>%
group_by(emp_length, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = emp_length, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Emp Length",
x = "Emp Length",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- home ownership
loan %>%
group_by(home_ownership, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = home_ownership, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Home Ownership",
x = "Home Ownership",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- verification status
loan %>%
group_by(verification_status, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = verification_status, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Verification Status",
x = "Verification Status",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- payment plan
loan %>%
group_by(pymnt_plan, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = pymnt_plan, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Payment Plan",
x = "Payment Plan",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- purpose
loan %>%
group_by(purpose, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = purpose, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default,Purpose",
x = "Purpose",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- acc_now_delinq
loan %>%
group_by(acc_now_delinq, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = acc_now_delinq, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Account Now Delinq",
x = "Account Now Delinq",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- bankruptcy
loan %>%
group_by(pub_rec_bankruptcies, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = pub_rec_bankruptcies, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Pub Rec Bankruptcies",
x = "Pub Rec Bankruptcies",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- tax_liens
loan %>%
group_by(tax_liens, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = tax_liens, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, tax_liens",
x = "tax_liens",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- policy code
loan %>%
group_by(policy_code, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = policy_code, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Policy Code",
x = "Policy Code",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

# -- address state
loan %>%
group_by(addr_state, loan_status) %>%
summarize(count = n()) %>%
mutate(pct = count/sum(count)) %>%
ggplot(aes(x = addr_state, y = pct, fill = loan_status)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("gray", "dodgerblue"), labels = c("current", "default")) +
labs(title = "% Default, Address State",
x = "Address State",
y = "Percentage Current vs. Default",
fill = "Loan Default") +
geom_hline(yintercept = 0.1503509,
linetype = "dashed",
color = "black",
size = 0.5) + coord_flip()

Exploratory Analysis - Numeric
n_cols <- names(loan %>% select_if(is.numeric) %>% dplyr::select(-id, -member_id))
my_hist <- function(col){
loan %>%
summarise(n = n(),
n_miss = sum(is.na(!!as.name(col))),
n_dist = n_distinct(!!as.name(col)),
mean = round(mean(!!as.name(col), na.rm = TRUE), 2),
min = min(!!as.name(col), na.rm = TRUE),
max = max(!!as.name(col), na.rm = TRUE)) -> col_summary
p <- ggtexttable(col_summary, rows = NULL,
theme = ttheme("mOrange"))
l1 <- loan %>%
ggplot(aes(x = !!as.name(col), fill = loan_status)) +
geom_histogram(bins = 30) +
labs(title = col)
plt <- ggarrange(l1, p, ncol = 1, nrow = 2, heights = c(1, 0.3))
print(plt)
}
for (c in n_cols){
my_hist(c)
}





























# -- correlation matrix
cor <- loan %>%
na.omit() %>%
select_if(is.numeric) %>%
cor() %>%
as.data.frame() %>%
rownames_to_column(var = "variable")
cor %>%
pivot_longer(cols = c("loan_amnt",
"funded_amnt",
"funded_amnt_inv",
"installment",
"annual_inc",
"dti",
"delinq_2yrs",
"fico_range_low",
"fico_range_high",
"inq_last_6mths",
"mths_since_last_delinq",
"mths_since_last_record",
"open_acc",
"pub_rec",
"revol_bal",
"total_acc",
"out_prncp",
"out_prncp_inv",
"total_rec_late_fee",
"delinq_amnt"),
names_to = "name",
values_to = "correlation") %>%
ggplot(aes(x = variable, y = name, fill = correlation)) +
geom_tile() +
labs(title = "Correlation Matrix",
x = "Variable",
y = "Variable") +
scale_fill_gradient2(mid = "#FBFEF9",
low = "#0C6291",
high = "#A63446") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
geom_text(aes(label = round(correlation, 2)), color = "Black", size = 1.5)

Partition Data
set.seed(123)
split <- initial_split(loan, prop = 0.7, strata = loan_status)
train <- training(split)
test <- testing(split)
sprintf("Train PCT : %1.2f%%", nrow(train)/nrow(loan) * 100)
## [1] "Train PCT : 70.00%"
sprintf("Test PCT : %1.2f%%", nrow(test)/nrow(loan) * 100)
## [1] "Test PCT : 30.00%"
loan_fold <- train %>% sample_frac(0.2)
Recipe
loan_recipe <- recipe(loan_status ~ ., data = train) %>%
step_rm(emp_title, url, desc, title, zip_code, earliest_cr_line, last_pymnt_d, addr_state,
issue_d, next_pymnt_d, last_credit_pull_d, id, member_id) %>%
step_impute_median(all_numeric_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
themis::step_downsample(loan_status, under_ratio = 3) %>%
step_nzv(all_nominal_predictors())
Models & Workflows
# -- Logistic Model
log_model <- logistic_reg() %>%
set_mode("classification") %>%
set_engine("glm")
log_workflow_fit <- workflow() %>%
add_recipe(loan_recipe) %>%
add_model(log_model) %>%
fit(train)
# -- XGBoost Model
xgb_model <- boost_tree(trees = 20) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_workflow_fit <- workflow() %>%
add_recipe(loan_recipe) %>%
add_model(xgb_model) %>%
fit(train)
# -- Random Forest Model
rf_model <- rand_forest(trees = 100) %>%
set_engine("ranger", num.thread = 8, importance = "permutation") %>%
set_mode("classification")
rf_workflow_fit <- workflow() %>%
add_recipe(loan_recipe) %>%
add_model(rf_model) %>%
fit(train)
Model Evaluation
evaluate_models <- function(model_workflow, model_name){
# -- make predictions
score_train <- bind_cols(
predict(model_workflow, train, type = "prob"),
predict(model_workflow, train, type = "class"),
train) %>%
mutate(part = "train")
score_test <- bind_cols(
predict(model_workflow, test, type = "prob"),
predict(model_workflow, test, type = "class"),
test) %>%
mutate(part = "test")
options(yardstick.event_first = FALSE)
bind_rows(score_train, score_test) %>%
group_by(part) %>%
metrics(loan_status, .pred_default, estimate = .pred_class) %>%
pivot_wider(id_cols = part, names_from = .metric, values_from = .estimate) %>%
mutate(model_name = model_name) %>% print()
# -- precision
bind_rows(score_train, score_test) %>%
group_by(part) %>%
precision(loan_status, .pred_class) %>%
mutate(model_name = model_name) %>% print()
# -- recall
bind_rows(score_train, score_test) %>%
group_by(part) %>%
recall(loan_status, .pred_class) %>%
mutate(model_name = model_name) %>% print()
# -- roc curve
bind_rows(score_train, score_test) %>%
group_by(part) %>%
roc_curve(truth = loan_status, predicted = .pred_default) %>%
autoplot() +
geom_vline(xintercept = 0.04, color = "red", linetype = "longdash") +
geom_vline(xintercept = 0.20, color = "black", linetype = "longdash") +
labs(title = model_name, x = "FPR(1 - specificity)", y = "TPR(recall)") -> roc_chart
print(roc_chart)
# -- operating range 0 - 10%
operating_range <- score_test %>%
roc_curve(loan_status, .pred_default) %>%
mutate(fpr = 1 - round(specificity, 2),
tpr = round(sensitivity, 3),
threshold = 1 - round(.threshold, 3)) %>%
#dplyr::select(fpr, tpr, threshold) %>%
group_by(fpr) %>%
summarise(threshold = round(mean(threshold), 3),
tpr = mean(tpr)) %>%
#mutate(precision = tpr/(fpr+tpr)) %>%
#mutate_if(is.numeric, round, 2) %>%
arrange(fpr) %>%
filter(fpr <= 0.1)
print(operating_range)
# -- score distribution
score_test %>%
ggplot(aes(.pred_default, fill = loan_status)) +
geom_histogram(bins = 50) +
geom_vline(aes(xintercept = .5, color = "red")) +
geom_vline(aes(xintercept = .3, color = "green")) +
geom_vline(aes(xintercept = .7, color = "blue")) +
labs(title = model_name) -> score_dist
print(score_dist)
# -- variable importance
model_workflow %>%
extract_fit_parsnip() %>%
vip(30) +
labs(model_name) -> vip_model
print(vip_model)
}
evaluate_models(xgb_workflow_fit, "XGBoost Model")
## # A tibble: 2 × 6
## part accuracy kap mn_log_loss roc_auc model_name
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 test 0.875 0.509 0.278 0.906 XGBoost Model
## 2 train 0.898 0.596 0.249 0.934 XGBoost Model
## # A tibble: 2 × 5
## part .metric .estimator .estimate model_name
## <chr> <chr> <chr> <dbl> <chr>
## 1 test precision binary 0.586 XGBoost Model
## 2 train precision binary 0.665 XGBoost Model
## # A tibble: 2 × 5
## part .metric .estimator .estimate model_name
## <chr> <chr> <chr> <dbl> <chr>
## 1 test recall binary 0.579 XGBoost Model
## 2 train recall binary 0.647 XGBoost Model

## # A tibble: 11 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 -Inf 0.0990
## 2 0.0100 0.221 0.236
## 3 0.0200 0.287 0.318
## 4 0.0300 0.346 0.387
## 5 0.0400 0.39 0.446
## 6 0.0500 0.427 0.488
## 7 0.0600 0.458 0.524
## 8 0.0700 0.491 0.567
## 9 0.0800 0.524 0.612
## 10 0.09 0.55 0.645
## 11 0.1 0.572 0.668


evaluate_models(rf_workflow_fit, "Random Forest Model")
## # A tibble: 2 × 6
## part accuracy kap mn_log_loss roc_auc model_name
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 test 0.867 0.420 0.325 0.878 Random Forest Model
## 2 train 0.966 0.872 0.206 0.993 Random Forest Model
## # A tibble: 2 × 5
## part .metric .estimator .estimate model_name
## <chr> <chr> <chr> <dbl> <chr>
## 1 test precision binary 0.580 Random Forest Model
## 2 train precision binary 0.860 Random Forest Model
## # A tibble: 2 × 5
## part .metric .estimator .estimate model_name
## <chr> <chr> <chr> <dbl> <chr>
## 1 test recall binary 0.431 Random Forest Model
## 2 train recall binary 0.925 Random Forest Model

## # A tibble: 11 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 -Inf 0.0559
## 2 0.0100 0.349 0.159
## 3 0.0200 0.402 0.249
## 4 0.0300 0.438 0.318
## 5 0.0400 0.464 0.367
## 6 0.0500 0.488 0.407
## 7 0.0600 0.509 0.453
## 8 0.0700 0.528 0.500
## 9 0.0800 0.548 0.537
## 10 0.09 0.563 0.564
## 11 0.1 0.576 0.590


evaluate_models(log_workflow_fit, "Logistic Model")
## # A tibble: 2 × 6
## part accuracy kap mn_log_loss roc_auc model_name
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 test 0.857 0.426 0.321 0.871 Logistic Model
## 2 train 0.862 0.440 0.315 0.874 Logistic Model
## # A tibble: 2 × 5
## part .metric .estimator .estimate model_name
## <chr> <chr> <chr> <dbl> <chr>
## 1 test precision binary 0.527 Logistic Model
## 2 train precision binary 0.544 Logistic Model
## # A tibble: 2 × 5
## part .metric .estimator .estimate model_name
## <chr> <chr> <chr> <dbl> <chr>
## 1 test recall binary 0.494 Logistic Model
## 2 train recall binary 0.499 Logistic Model

## # A tibble: 11 × 3
## fpr threshold tpr
## <dbl> <dbl> <dbl>
## 1 0 -Inf 0.0247
## 2 0.0100 0.178 0.105
## 3 0.0200 0.267 0.203
## 4 0.0300 0.328 0.284
## 5 0.0400 0.373 0.342
## 6 0.0500 0.413 0.385
## 7 0.0600 0.447 0.426
## 8 0.0700 0.477 0.468
## 9 0.0800 0.502 0.499
## 10 0.09 0.522 0.525
## 11 0.1 0.543 0.554


XGBoost Evaluation
options(yardstick.event_first = FALSE)
# -- training
predict(xgb_workflow_fit, train, type = "prob") %>%
bind_cols(predict(xgb_workflow_fit, train, type = "class")) %>%
mutate(part = "train") %>%
bind_cols(., train) -> xgb_scored_train
# -- testing
predict(xgb_workflow_fit, test, type = "prob") %>%
bind_cols(predict(xgb_workflow_fit, test, type = "class")) %>%
mutate(part = "test") %>%
bind_cols(., test) -> xgb_scored_test
# -- Confusion Matrix
xgb_scored_train %>%
conf_mat(loan_status, .pred_class) %>%
autoplot(type = "heatmap") +
labs(title = "Train Confusion Matrix")

xgb_scored_test %>%
conf_mat(loan_status, .pred_class) %>%
autoplot(type = "heatmap") +
labs(title = "Test Confusion Matrix")

Kaggle
loan_kaggle <- read_csv("loan_holdout.csv", na = c("NA", "", "-")) %>%
mutate(int_rate = as.numeric(str_replace(int_rate, "%", "")),
revol_util = as.numeric(str_replace(revol_util, "%", "")),
empl_year = word(emp_length)) %>%
mutate_if(is.character, as.factor) %>%
clean_names()
## Rows: 12761 Columns: 51
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): term, int_rate, grade, sub_grade, emp_title, emp_length, home_owne...
## dbl (29): id, member_id, loan_amnt, funded_amnt, funded_amnt_inv, installmen...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
skim(loan_kaggle)
Data summary
| Name |
loan_kaggle |
| Number of rows |
12761 |
| Number of columns |
52 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
21 |
| numeric |
31 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| term |
0 |
1.00 |
FALSE |
2 |
36 : 9374, 60 : 3387 |
| grade |
0 |
1.00 |
FALSE |
7 |
B: 3769, A: 3041, C: 2672, D: 1748 |
| sub_grade |
0 |
1.00 |
FALSE |
35 |
B3: 909, B5: 875, A4: 861, A5: 836 |
| emp_title |
804 |
0.94 |
FALSE |
10242 |
US : 44, Ban: 32, AT&: 22, IBM: 21 |
| emp_length |
0 |
1.00 |
FALSE |
12 |
10+: 2792, < 1: 1571, 2 y: 1430, 3 y: 1356 |
| home_ownership |
0 |
1.00 |
FALSE |
5 |
REN: 6117, MOR: 5619, OWN: 976, OTH: 45 |
| verification_status |
0 |
1.00 |
FALSE |
3 |
Not: 5630, Ver: 4011, Sou: 3120 |
| issue_d |
0 |
1.00 |
FALSE |
55 |
Dec: 686, Nov: 664, Oct: 651, Sep: 614 |
| pymnt_plan |
0 |
1.00 |
FALSE |
1 |
n: 12761 |
| url |
0 |
1.00 |
FALSE |
12761 |
htt: 1, htt: 1, htt: 1, htt: 1 |
| desc |
4088 |
0.68 |
FALSE |
8664 |
Deb: 4, Cam: 3, con: 2, con: 2 |
| purpose |
0 |
1.00 |
FALSE |
14 |
deb: 5960, cre: 1627, oth: 1317, hom: 973 |
| title |
1 |
1.00 |
FALSE |
7073 |
Deb: 702, Deb: 536, Per: 194, Con: 177 |
| zip_code |
0 |
1.00 |
FALSE |
760 |
100: 191, 900: 161, 112: 154, 945: 152 |
| addr_state |
0 |
1.00 |
FALSE |
50 |
CA: 2241, NY: 1229, FL: 904, TX: 848 |
| earliest_cr_line |
9 |
1.00 |
FALSE |
474 |
Oct: 117, Dec: 115, Oct: 115, Dec: 110 |
| last_pymnt_d |
19 |
1.00 |
FALSE |
105 |
Mar: 324, Dec: 313, Apr: 284, May: 281 |
| next_pymnt_d |
11817 |
0.07 |
FALSE |
88 |
Oct: 163, Apr: 29, Feb: 29, Mar: 28 |
| last_credit_pull_d |
2 |
1.00 |
FALSE |
107 |
Sep: 4908, Mar: 277, Aug: 233, Jul: 196 |
| application_type |
0 |
1.00 |
FALSE |
1 |
IND: 12761 |
| empl_year |
0 |
1.00 |
FALSE |
12 |
10+: 2792, <: 1571, 2: 1430, 3: 1356 |
Variable type: numeric
| id |
0 |
1.00 |
668251.54 |
217909.00 |
55521.00 |
502532.00 |
651269.00 |
826140.00 |
1077175.00 |
▁▃▇▇▅ |
| member_id |
0 |
1.00 |
830682.55 |
277425.28 |
70626.00 |
645610.00 |
832879.00 |
1034801.00 |
1313524.00 |
▁▃▇▇▆ |
| loan_amnt |
0 |
1.00 |
11043.73 |
7425.67 |
500.00 |
5000.00 |
9600.00 |
15000.00 |
35000.00 |
▇▆▃▂▁ |
| funded_amnt |
0 |
1.00 |
10770.14 |
7146.61 |
500.00 |
5000.00 |
9500.00 |
15000.00 |
35000.00 |
▇▇▃▂▁ |
| funded_amnt_inv |
0 |
1.00 |
10116.91 |
7133.85 |
0.00 |
4903.99 |
8400.00 |
14000.00 |
35000.00 |
▇▆▃▁▁ |
| int_rate |
0 |
1.00 |
12.16 |
3.69 |
5.42 |
9.62 |
11.99 |
14.72 |
24.59 |
▅▇▆▂▁ |
| installment |
0 |
1.00 |
319.86 |
206.93 |
16.85 |
164.46 |
274.63 |
424.60 |
1269.73 |
▇▆▂▁▁ |
| annual_inc |
3 |
1.00 |
68985.62 |
57925.95 |
1896.00 |
40000.00 |
59000.00 |
82435.00 |
2039784.00 |
▇▁▁▁▁ |
| dti |
0 |
1.00 |
13.35 |
6.70 |
0.00 |
8.23 |
13.43 |
18.63 |
29.96 |
▅▇▇▆▁ |
| delinq_2yrs |
9 |
1.00 |
0.15 |
0.48 |
0.00 |
0.00 |
0.00 |
0.00 |
9.00 |
▇▁▁▁▁ |
| fico_range_low |
0 |
1.00 |
713.05 |
35.90 |
620.00 |
685.00 |
710.00 |
740.00 |
825.00 |
▁▇▇▃▁ |
| fico_range_high |
0 |
1.00 |
717.05 |
35.90 |
624.00 |
689.00 |
714.00 |
744.00 |
829.00 |
▁▇▇▃▁ |
| inq_last_6mths |
9 |
1.00 |
1.08 |
1.49 |
0.00 |
0.00 |
1.00 |
2.00 |
27.00 |
▇▁▁▁▁ |
| mths_since_last_delinq |
8022 |
0.37 |
35.71 |
22.64 |
0.00 |
17.00 |
34.00 |
52.00 |
115.00 |
▇▇▅▂▁ |
| mths_since_last_record |
11679 |
0.08 |
59.06 |
47.00 |
0.00 |
0.00 |
85.00 |
101.00 |
120.00 |
▇▁▂▅▇ |
| open_acc |
9 |
1.00 |
9.36 |
4.46 |
1.00 |
6.00 |
9.00 |
12.00 |
36.00 |
▇▇▂▁▁ |
| pub_rec |
9 |
1.00 |
0.06 |
0.24 |
0.00 |
0.00 |
0.00 |
0.00 |
4.00 |
▇▁▁▁▁ |
| revol_bal |
0 |
1.00 |
14269.54 |
20349.43 |
0.00 |
3625.00 |
8903.00 |
17345.00 |
602519.00 |
▇▁▁▁▁ |
| revol_util |
26 |
1.00 |
49.19 |
28.45 |
0.00 |
25.50 |
50.10 |
72.80 |
105.30 |
▇▇▇▇▅ |
| total_acc |
9 |
1.00 |
22.22 |
11.60 |
1.00 |
14.00 |
21.00 |
29.00 |
90.00 |
▇▇▂▁▁ |
| out_prncp |
0 |
1.00 |
11.97 |
129.59 |
0.00 |
0.00 |
0.00 |
0.00 |
3555.85 |
▇▁▁▁▁ |
| out_prncp_inv |
0 |
1.00 |
11.94 |
129.24 |
0.00 |
0.00 |
0.00 |
0.00 |
3553.30 |
▇▁▁▁▁ |
| total_rec_late_fee |
0 |
1.00 |
1.54 |
8.07 |
0.00 |
0.00 |
0.00 |
0.00 |
209.00 |
▇▁▁▁▁ |
| last_pymnt_amnt |
0 |
1.00 |
2606.45 |
4412.08 |
0.00 |
212.04 |
518.35 |
3162.89 |
35471.86 |
▇▁▁▁▁ |
| collections_12_mths_ex_med |
44 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
| policy_code |
0 |
1.00 |
1.00 |
0.00 |
1.00 |
1.00 |
1.00 |
1.00 |
1.00 |
▁▁▇▁▁ |
| acc_now_delinq |
9 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
| chargeoff_within_12_mths |
44 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
| delinq_amnt |
9 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
| pub_rec_bankruptcies |
402 |
0.97 |
0.04 |
0.21 |
0.00 |
0.00 |
0.00 |
0.00 |
2.00 |
▇▁▁▁▁ |
| tax_liens |
29 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
▁▁▇▁▁ |
predict(xgb_workflow_fit, loan_kaggle, type = "prob") %>%
bind_cols(.,loan_kaggle) %>%
dplyr::select(id, loan_status = .pred_default) %>%
write_csv("final_kaggle_submission.csv")
Top Predictions
top_tp <- xgb_scored_test %>%
filter(.pred_class == loan_status) %>%
filter(loan_status == "default") %>%
slice_max(order_by = .pred_default, n = 10)
top_tn <- xgb_scored_test %>%
filter(.pred_class == loan_status) %>%
filter(loan_status == "current") %>%
slice_max(order_by = .pred_default, n = 10)
top_fp <- xgb_scored_test %>%
filter(.pred_class != loan_status) %>%
filter(loan_status == "current") %>%
slice_max(order_by = .pred_default, n = 10)
bottom_fn <- xgb_scored_test %>%
filter(.pred_class != loan_status) %>%
filter(loan_status == "default") %>%
slice_min(order_by = .pred_default, n = 10)
Partial Dependence Plot
# -- try step_profile
grid <- recipe(loan_status ~ ., data = train) %>%
step_profile(all_predictors(), -last_pymnt_amnt, profile = vars(last_pymnt_amnt)) %>%
prep() %>% juice()
predict(xgb_workflow_fit, grid, type = "prob") %>%
bind_cols(grid %>% dplyr::select(last_pymnt_amnt)) %>%
ggplot(aes(y = .pred_default, x = last_pymnt_amnt)) +
geom_path() + stat_smooth() +
labs(title = "Partial Dependence Plot - Last Payment Amount")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# -- create exlainer object
xgb_explainer <- explain_tidymodels(xgb_workflow_fit, data = train,
y = train$loan_status, verbose = TRUE)
## Preparation of a new explainer is initiated
## -> model label : workflow ( default )
## -> data : 20843 rows 53 cols
## -> data : tibble converted into a data.frame
## -> target variable : 20843 values
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a factor . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.001366913 , mean = 0.2094293 , max = 0.9826499
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = NA , mean = NA , max = NA
## A new explainer has been created!
# -- profile the variable of interest
pdp_last_pymnt_amnt <- model_profile(xgb_explainer, variables = "last_pymnt_amnt")
# -- plot it
plot(pdp_last_pymnt_amnt) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "last_pymnt_amnt",
y = "Average Impact on Prediction")

# -- PDP: late fees received to date
pdp_total_rec_late_fee <- model_profile(xgb_explainer, variables = "total_rec_late_fee")
plot(pdp_total_rec_late_fee) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Late Fees Received to Date",
y = "Average Impact on Prediction")

# -- PDP: interest rate on the loan
pdp_int_rate <- model_profile(xgb_explainer, variables = "int_rate")
plot(pdp_int_rate) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Interest Rate on the Loan",
y = "Average Impact on Prediction")

# -- PDP: monthly payment owed by the borrower if the loan originates
pdp_installment <- model_profile(xgb_explainer, variables = "installment")
plot(pdp_installment) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Monthly Payment Owed by Borrower (if the loan originates)",
y = "Average Impact on Prediction")

# -- PDP: remaining outstanding principal for total amount funded
pdp_out_prncp <- model_profile(xgb_explainer, variables = "out_prncp")
plot(pdp_out_prncp) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Remaining Outstanding Principal",
y = "Average Impact on Prediction")

# -- PDP: total amount committed by investors for that loan at that point in time
pdp_funded_amnt_inv <- model_profile(xgb_explainer, variables = "funded_amnt_inv")
plot(pdp_funded_amnt_inv) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Total Amount Committed by Investors",
y = "Average Impact on Prediction")

# -- PDP: self-reported annual income provided by the borrower during registration
pdp_annual_inc <- model_profile(xgb_explainer, variables = "annual_inc")
plot(pdp_annual_inc) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Borrower's Annual Income",
y = "Average Impact on Prediction")

# -- PDP: listed amount of the loan applied for by the borrower
pdp_loan_amnt <- model_profile(xgb_explainer, variables = "loan_amnt")
plot(pdp_loan_amnt) + labs(title = "Partial Dependence Plot",
subtitle = " ",
x = "Amount of Loan",
y = "Average Impact on Prediction")

Shap & Breakdown Plots
tidy_explainer <- explain_tidymodels(xgb_workflow_fit, data = test,
y = test$loan_status, label = "tidymodels")
## Preparation of a new explainer is initiated
## -> model label : tidymodels
## -> data : 8934 rows 53 cols
## -> data : tibble converted into a data.frame
## -> target variable : 8934 values
## -> predict function : yhat.workflow will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package tidymodels , ver. 1.0.0 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a factor . ( WARNING )
## -> model_info : By deafult classification tasks supports only numercical 'y' parameter.
## -> model_info : Consider changing to numerical vector with 0 and 1 values.
## -> model_info : Otherwise I will not be able to calculate residuals or loss function.
## -> predicted values : numerical, min = 0.001402915 , mean = 0.2128489 , max = 0.9764078
## -> residual function : difference between y and yhat ( default )
## Warning in Ops.factor(y, predict_function(model, data)): '-' not meaningful for
## factors
## -> residuals : numerical, min = NA , mean = NA , max = NA
## A new explainer has been created!
shap_explain <- predict_parts(tidy_explainer, top_tp %>% head(1), type = "shap")
plot(shap_explain)

score <- 0.67
as_tibble(shap_explain) %>%
group_by(variable) %>%
summarise(contribution = sum(contribution)) %>%
top_n(wt = abs(contribution), 10) %>%
mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
arrange(desc(contribution)) %>%
ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
geom_col() +
labs(title = paste("Shap Explainer, Predicted Score:", score))

breakdown_explain <- predict_parts(tidy_explainer, top_tp %>% head(1),
type = "break_down_interactions")
plot(breakdown_explain) + labs(title = paste("Breakdown Plot, Predicted Score:", score))

# -- true postive
breakdown_explainer <- function(row){
breakdown_explain <- predict_parts(tidy_explainer, row, type = "break_down_interactions")
plot(breakdown_explain) +
labs(title = paste("Breakdown Plot, Predicted Score:", round(row$.pred_default, 3)))
}
for (row in 1:2){
dat <- top_tp[row,]
print(breakdown_explainer(dat))
}


for (row in 3:3){
dat <- top_tp[row,]
print(breakdown_explainer(dat))
}

# -- false positive
shap_explain <- predict_parts(tidy_explainer, top_fp %>% head(1), type = "shap")
plot(shap_explain)

as_tibble(shap_explain) %>%
group_by(variable) %>%
summarise(contribution = sum(contribution)) %>%
top_n(wt = abs(contribution), 10) %>%
mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
arrange(desc(contribution)) %>%
ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
geom_col() +
labs(title = paste("Shap Explainer, Predicted Score:", score))

for (row in 1:3){
dat <- top_fp[row,]
print(breakdown_explainer(dat))
}



# -- true negative
shap_explain <- predict_parts(tidy_explainer, top_tn %>% head(1), type = "shap")
plot(shap_explain)

as_tibble(shap_explain) %>%
group_by(variable) %>%
summarise(contribution = sum(contribution)) %>%
top_n(wt = abs(contribution), 10) %>%
mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
arrange(desc(contribution)) %>%
ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
geom_col() +
labs(title = paste("Shap Explainer, Predicted Score:", score))

for (row in 1:3){
dat <- top_tn[row,]
print(breakdown_explainer(dat))
}



# -- false negative
shap_explain <- predict_parts(tidy_explainer, bottom_fn %>% head(1), type = "shap")
plot(shap_explain)

as_tibble(shap_explain) %>%
group_by(variable) %>%
summarise(contribution = sum(contribution)) %>%
top_n(wt = abs(contribution), 10) %>%
mutate(pos_neg = if_else(contribution < 0, "neg", "pos")) %>%
arrange(desc(contribution)) %>%
ggplot(aes(x = contribution, y = reorder(variable, contribution), fill = pos_neg)) +
geom_col() +
labs(title = paste("Shap Explainer, Predicted Score:", score))

for (row in 1:3){
dat <- bottom_fn[row,]
print(breakdown_explainer(dat))
}



Isolation Recipe
# -- deal w. categorical
iso_recipe <- recipe(~.,loan) %>%
step_rm(id,member_id,emp_title,issue_d,url,desc,title,zip_code,earliest_cr_line,last_pymnt_d,
next_pymnt_d,last_credit_pull_d) %>%
step_unknown(all_nominal_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_impute_median(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
iso_prep <- bake(iso_recipe %>% prep(), loan)
# -- deal w. numeric
iso_numeric <- loan %>% select_if(is.numeric)
iso_recipe <- recipe(~.,iso_numeric) %>%
step_rm(id, member_id) %>%
step_impute_median(all_numeric()) %>%
prep()
Isolation Forest
iso_forest <- isolationForest$new(sample_size = 256,
num_trees = 100,
max_depth = ceiling(log2(256)))
iso_forest$fit(iso_prep)
## INFO [17:13:11.288] dataset has duplicated rows
## INFO [17:13:11.354] Building Isolation Forest ...
## INFO [17:13:12.906] done
## INFO [17:13:12.908] Computing depth of terminal nodes ...
## INFO [17:13:13.333] done
## INFO [17:13:15.124] Completed growing isolation forest
Predict Trainning
pred_train <- iso_forest$predict(iso_prep)
pred_train %>%
summarise(n = n(),
min = min(average_depth),
max = max(average_depth),
mean = mean(average_depth),
min_score = min(anomaly_score),
max_score = max(anomaly_score),
mean_score = mean(anomaly_score))
# -- average tree depth
pred_train %>%
ggplot(aes(average_depth)) +
geom_histogram(bins=20) +
geom_vline(xintercept = 7.4, linetype = "dotted",
color = "blue", size = 1.5) +
labs(title = "Isolation Forest Tree Depth")

# -- average anomaly score
pred_train %>%
ggplot(aes(anomaly_score)) +
geom_histogram(bins=20) +
geom_vline(xintercept = 0.607, linetype = "dotted",
color = "blue", size = 1.5) +
labs(title = "Isolation Forest Anomaly Score")

Create a Data Frame with Anomaly Flag
train_pred <- bind_cols(iso_forest$predict(iso_prep), iso_prep) %>%
mutate(anomaly = as.factor(if_else(average_depth <= 7.4, "Anomaly", "Normal")))
train_pred %>%
arrange(average_depth) %>%
count(anomaly)
head(train_pred, 10)
Fit a Tree
fmla <- as.formula(paste("anomaly ~ ", paste(iso_prep %>% colnames(), collapse= "+")))
outlier_tree <- decision_tree(min_n = 2, tree_depth = 3,
cost_complexity = .01) %>%
set_mode("classification") %>%
set_engine("rpart") %>%
fit(fmla, data = train_pred)
outlier_tree$fit
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 236 Normal (0.007925580 0.992074420)
## 2) addr_state_AZ>=0.5 667 56 Normal (0.083958021 0.916041979)
## 4) int_rate>=18.63 41 20 Normal (0.487804878 0.512195122)
## 8) last_pymnt_amnt>=692.905 26 7 Anomaly (0.730769231 0.269230769) *
## 9) last_pymnt_amnt< 692.905 15 1 Normal (0.066666667 0.933333333) *
## 5) int_rate< 18.63 626 36 Normal (0.057507987 0.942492013) *
## 3) addr_state_AZ< 0.5 29110 180 Normal (0.006183442 0.993816558) *
# -- plotting decision trees
library(rpart.plot)
rpart.plot(outlier_tree$fit, clip.right.labs = FALSE,
branch = .3, under = TRUE, roundint = FALSE, extra = 3)

Global Anomaly Rules
Convert into English rules
anomaly_rules <- rpart.rules(outlier_tree$fit, roundint = FALSE,
extra = 4, cover = TRUE, clip.facs = TRUE) %>%
clean_names() %>%
mutate(rule = "IF")
rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()
for (col in rule_cols){anomaly_rules <- anomaly_rules %>%
mutate(rule = paste(rule, !!as.name(col)))
}
anomaly_rules %>%
as.data.frame() %>%
filter(anomaly == "Anomaly") %>%
mutate(rule = paste(rule, " THEN ", anomaly)) %>%
mutate(rule = paste(rule, " coverage ", cover)) %>%
dplyr::select(rule)
anomaly_rules %>%
as.data.frame() %>%
filter(anomaly == "Normal") %>%
mutate(rule = paste(rule, " THEN ", anomaly)) %>%
mutate(rule = paste(rule, " coverage ", cover)) %>%
dplyr::select(rule)
pred_train <- bind_cols(iso_forest$predict(iso_prep), iso_prep)
pred_train %>%
arrange(desc(anomaly_score)) %>%
filter(average_depth <= 7.4)
Local Anomaly Rules
fmla <- as.formula(paste("anomaly ~ ", paste(iso_prep %>% colnames(), collapse= "+")))
# -- identify observation as anomaly
pred_train %>%
mutate(anomaly = as.factor(if_else(id==28006, "Anomaly", "Normal"))) -> local_df
local_tree <- decision_tree(mode = "classification", tree_depth = 4,
min_n = 1, cost_complexity = 0.01) %>%
set_engine("rpart") %>%
fit(fmla, local_df)
local_tree$fit
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) addr_state_OH>=0.5 912 1 Normal (1.096491e-03 9.989035e-01)
## 4) annual_inc>=159998 21 1 Normal (4.761905e-02 9.523810e-01)
## 8) sub_grade_C2>=0.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 9) sub_grade_C2< 0.5 20 0 Normal (0.000000e+00 1.000000e+00) *
## 5) annual_inc< 159998 891 0 Normal (0.000000e+00 1.000000e+00) *
## 3) addr_state_OH< 0.5 28865 0 Normal (0.000000e+00 1.000000e+00) *
rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE, roundint = FALSE)
rpart.plot(local_tree$fit, roundint = FALSE, extra = 3)

anomaly_rules <- rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE) %>%
clean_names() %>%
filter(anomaly == "Anomaly") %>%
mutate(rule = "IF")
rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()
for (col in rule_cols){
anomaly_rules <- anomaly_rules %>%
mutate(rule = paste(rule, !!as.name(col)))
}
as.data.frame(anomaly_rules) %>% dplyr::select(rule, cover)
local_explainer <- function(ID){
fmla <- as.formula(paste("anomaly ~ ", paste(iso_prep %>% colnames(), collapse= "+")))
pred_train %>%
mutate(anomaly = as.factor(if_else(id==ID, "Anomaly", "Normal"))) -> local_df
local_tree <- decision_tree(mode="classification",
tree_depth = 3,
min_n = 1,
cost_complexity = 0) %>%
set_engine("rpart") %>%
fit(fmla, local_df)
local_tree$fit
rpart.plot(local_tree$fit, roundint = FALSE, extra = 3) %>% print()
anomaly_rules <- rpart.rules(local_tree$fit, extra = 4, cover = TRUE, clip.facs = TRUE) %>%
clean_names() %>%
filter(anomaly == "Anomaly") %>%
mutate(rule = "IF")
rule_cols <- anomaly_rules %>% dplyr::select(starts_with("x_")) %>% colnames()
for (col in rule_cols){
anomaly_rules <- anomaly_rules %>%
mutate(rule = paste(rule, !!as.name(col)))
}
as.data.frame(anomaly_rules) %>%
dplyr::select(rule, cover) %>%
print()
}
pred_train %>%
slice_max(order_by = anomaly_score, n = 5) %>%
pull(id) -> anomaly_vect
for (anomaly_id in anomaly_vect){
print(anomaly_id)
local_explainer(anomaly_id)
}
## [1] 12024
## $obj
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) pub_rec>=2.5 13 1 Normal (7.692308e-02 9.230769e-01)
## 4) int_rate< 10.73 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 5) int_rate>=10.73 12 0 Normal (0.000000e+00 1.000000e+00) *
## 3) pub_rec< 2.5 29764 0 Normal (0.000000e+00 1.000000e+00) *
##
## $snipped.nodes
## NULL
##
## $xlim
## [1] -0.2 1.2
##
## $ylim
## [1] -0.15 1.15
##
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
##
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
##
## $branch.x
## [,1] [,2] [,3] [,4] [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.6073813 0.27793723 0.2779372 0.6073813
##
## $branch.y
## [,1] [,2] [,3] [,4] [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
## NA 0.8335647 0.4251641 0.4251641 0.8335647
## NA 0.8335647 0.4251641 0.4251641 0.8335647
##
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 13" "Anomaly\n0 / 1"
## [4] "Normal\n0 / 12" "Normal\n0 / 29764"
##
## $cex
## [1] 1
##
## $boxes
## $boxes$x1
## [1] 0.51073356 0.20097700 -0.03058419 0.42060637 0.84017762
##
## $boxes$y1
## [1] 0.88316255 0.47476193 -0.01531881 -0.01531881 -0.01531881
##
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
##
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
##
##
## $split.labs
## [1] ""
##
## $split.cex
## [1] 1 1 1 1 1
##
## $split.box
## $split.box$x1
## [1] 0.4570404 0.1478804 NA NA NA
##
## $split.box$y1
## [1] 0.7954125 0.3870119 NA NA NA
##
## $split.box$x2
## [1] 0.7577222 0.4079941 NA NA NA
##
## $split.box$y2
## [1] 0.8717169 0.4633163 NA NA NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.rules with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

## rule cover
## 4 IF pub_rec >= 2.5 & int_rate < 11 0%
## [1] 5259
## $obj
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) last_pymnt_amnt>=34536.97 12 1 Normal (8.333333e-02 9.166667e-01)
## 4) annual_inc>=446500 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 5) annual_inc< 446500 11 0 Normal (0.000000e+00 1.000000e+00) *
## 3) last_pymnt_amnt< 34536.97 29765 0 Normal (0.000000e+00 1.000000e+00) *
##
## $snipped.nodes
## NULL
##
## $xlim
## [1] -0.2 1.2
##
## $ylim
## [1] -0.15 1.15
##
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
##
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
##
## $branch.x
## [,1] [,2] [,3] [,4] [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.6073813 0.27793723 0.2779372 0.6073813
##
## $branch.y
## [,1] [,2] [,3] [,4] [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
## NA 0.8335647 0.4251641 0.4251641 0.8335647
## NA 0.8335647 0.4251641 0.4251641 0.8335647
##
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 12" "Anomaly\n0 / 1"
## [4] "Normal\n0 / 11" "Normal\n0 / 29765"
##
## $cex
## [1] 1
##
## $boxes
## $boxes$x1
## [1] 0.51073356 0.20097700 -0.03058419 0.42060637 0.84017762
##
## $boxes$y1
## [1] 0.88316255 0.47476193 -0.01531881 -0.01531881 -0.01531881
##
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
##
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
##
##
## $split.labs
## [1] ""
##
## $split.cex
## [1] 1 1 1 1 1
##
## $split.box
## $split.box$x1
## [1] 0.34667104 0.06316449 NA NA NA
##
## $split.box$y1
## [1] 0.7954125 0.3870119 NA NA NA
##
## $split.box$x2
## [1] 0.8680915 0.4927100 NA NA NA
##
## $split.box$y2
## [1] 0.8717169 0.4633163 NA NA NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.rules with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

## rule cover
## 4 IF last_pymnt_amnt >= 34537 & annual_inc >= 446500 0%
## [1] 5822
## $obj
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) int_rate>=22.415 69 1 Normal (1.449275e-02 9.855072e-01)
## 4) dti< 1.29 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 5) dti>=1.29 68 0 Normal (0.000000e+00 1.000000e+00) *
## 3) int_rate< 22.415 29708 0 Normal (0.000000e+00 1.000000e+00) *
##
## $snipped.nodes
## NULL
##
## $xlim
## [1] -0.2 1.2
##
## $ylim
## [1] -0.15 1.15
##
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
##
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
##
## $branch.x
## [,1] [,2] [,3] [,4] [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.6073813 0.27793723 0.2779372 0.6073813
##
## $branch.y
## [,1] [,2] [,3] [,4] [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
## NA 0.8335647 0.4251641 0.4251641 0.8335647
## NA 0.8335647 0.4251641 0.4251641 0.8335647
##
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 69" "Anomaly\n0 / 1"
## [4] "Normal\n0 / 68" "Normal\n0 / 29708"
##
## $cex
## [1] 1
##
## $boxes
## $boxes$x1
## [1] 0.51073356 0.20097700 -0.03058419 0.42060637 0.84017762
##
## $boxes$y1
## [1] 0.88316255 0.47476193 -0.01531881 -0.01531881 -0.01531881
##
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
##
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
##
##
## $split.labs
## [1] ""
##
## $split.cex
## [1] 1 1 1 1 1
##
## $split.box
## $split.box$x1
## [1] 0.4659892 0.1890452 NA NA NA
##
## $split.box$y1
## [1] 0.7954125 0.3870119 NA NA NA
##
## $split.box$x2
## [1] 0.7487733 0.3668293 NA NA NA
##
## $split.box$y2
## [1] 0.8717169 0.4633163 NA NA NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.rules with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

## rule cover
## 4 IF int_rate >= 22 & dti < 1.3 0%
## [1] 3880
## $obj
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) out_prncp>=1127.87 120 1 Normal (8.333333e-03 9.916667e-01)
## 4) out_prncp< 1131.13 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 5) out_prncp>=1131.13 119 0 Normal (0.000000e+00 1.000000e+00) *
## 3) out_prncp< 1127.87 29657 0 Normal (0.000000e+00 1.000000e+00) *
##
## $snipped.nodes
## NULL
##
## $xlim
## [1] -0.2 1.2
##
## $ylim
## [1] -0.15 1.15
##
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
##
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
##
## $branch.x
## [,1] [,2] [,3] [,4] [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.6073813 0.27793723 0.2779372 0.6073813
##
## $branch.y
## [,1] [,2] [,3] [,4] [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
## NA 0.8335647 0.4251641 0.4251641 0.8335647
## NA 0.8335647 0.4251641 0.4251641 0.8335647
##
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 120" "Anomaly\n0 / 1"
## [4] "Normal\n0 / 119" "Normal\n0 / 29657"
##
## $cex
## [1] 1
##
## $boxes
## $boxes$x1
## [1] 0.51073356 0.20097700 -0.03058419 0.42060637 0.84017762
##
## $boxes$y1
## [1] 0.88316255 0.47476193 -0.01531881 -0.01531881 -0.01531881
##
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
##
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
##
##
## $split.labs
## [1] ""
##
## $split.cex
## [1] 1 1 1 1 1
##
## $split.box
## $split.box$x1
## [1] 0.4224381 0.1043293 NA NA NA
##
## $split.box$y1
## [1] 0.7954125 0.3870119 NA NA NA
##
## $split.box$x2
## [1] 0.7923245 0.4515452 NA NA NA
##
## $split.box$y2
## [1] 0.8717169 0.4633163 NA NA NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.rules with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

## rule cover
## 4 IF out_prncp is 1128 to 1131 0%
## [1] 28010
## $obj
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) sub_grade_E5>=0.5 356 1 Normal (2.808989e-03 9.971910e-01)
## 4) emp_length_X9.years>=0.5 12 1 Normal (8.333333e-02 9.166667e-01)
## 8) inq_last_6mths>=3.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 9) inq_last_6mths< 3.5 11 0 Normal (0.000000e+00 1.000000e+00) *
## 5) emp_length_X9.years< 0.5 344 0 Normal (0.000000e+00 1.000000e+00) *
## 3) sub_grade_E5< 0.5 29421 0 Normal (0.000000e+00 1.000000e+00) *
##
## $snipped.nodes
## NULL
##
## $xlim
## [1] 0 1
##
## $ylim
## [1] 0 1
##
## $x
## [1] 0.68059108 0.42435682 0.20472744 0.05830786 0.35114702 0.64398619 0.93682535
##
## $y
## [1] 0.93276038 0.66049330 0.38822622 0.03427902 0.03427902 0.03427902 0.03427902
##
## $branch.x
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## x 0.6805911 0.4243568 0.2047274 0.05830786 0.3511470 0.6439862 0.9368254
## NA 0.4243568 0.2047274 0.05830786 0.3511470 0.6439862 0.9368254
## NA 0.6805911 0.4243568 0.20472744 0.2047274 0.4243568 0.6805911
##
## $branch.y
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## y 1.00026 0.7279933 0.4557262 0.1017790 0.1017790 0.1017790 0.101779
## NA 0.8564560 0.5841889 0.3119219 0.3119219 0.5841889 0.856456
## NA 0.8564560 0.5841889 0.3119219 0.3119219 0.5841889 0.856456
##
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 356" "Normal\n1 / 12"
## [4] "Anomaly\n0 / 1" "Normal\n0 / 11" "Normal\n0 / 344"
## [7] "Normal\n0 / 29421"
##
## $cex
## [1] 1
##
## $boxes
## $boxes$x1
## [1] 0.611556988 0.369385220 0.149755847 -0.005186464 0.296175429
## [6] 0.589014593 0.867791256
##
## $boxes$y1
## [1] 0.894608204 0.622341125 0.350074046 -0.003873157 -0.003873157
## [6] -0.003873157 -0.003873157
##
## $boxes$x2
## [1] 0.7496252 0.4793284 0.2596990 0.1218022 0.4061186 0.6989578 1.0058594
##
## $boxes$y2
## [1] 1.0002604 0.7279933 0.4557262 0.1017790 0.1017790 0.1017790 0.1017790
##
##
## $split.labs
## [1] ""
##
## $split.cex
## [1] 1 1 1 1 1 1 1
##
## $split.box
## $split.box$x1
## [1] 0.53314789 0.22961248 0.05003993 NA NA NA NA
##
## $split.box$y1
## [1] 0.8271082 0.5548411 0.2825740 NA NA NA NA
##
## $split.box$x2
## [1] 0.8280343 0.6191011 0.3594150 NA NA NA NA
##
## $split.box$y2
## [1] 0.8858039 0.6135368 0.3412697 NA NA NA NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.rules with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

## rule
## 8 IF sub_grade_E5 >= 0.5 & emp_length_X9.years >= 0.5 & inq_last_6mths >= 3.5
## cover
## 8 0%
## [1] 29658
## $obj
## n= 29777
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 29777 1 Normal (3.358297e-05 9.999664e-01)
## 2) total_rec_late_fee>=87.02027 37 1 Normal (2.702703e-02 9.729730e-01)
## 4) pub_rec>=0.5 1 0 Anomaly (1.000000e+00 0.000000e+00) *
## 5) pub_rec< 0.5 36 0 Normal (0.000000e+00 1.000000e+00) *
## 3) total_rec_late_fee< 87.02027 29740 0 Normal (0.000000e+00 1.000000e+00) *
##
## $snipped.nodes
## NULL
##
## $xlim
## [1] -0.2 1.2
##
## $ylim
## [1] -0.15 1.15
##
## $x
## [1] 0.60738129 0.27793723 0.05830786 0.49756661 0.93682535
##
## $y
## [1] 0.93276038 0.52435976 0.03427902 0.03427902 0.03427902
##
## $branch.x
## [,1] [,2] [,3] [,4] [,5]
## x 0.6073813 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.2779372 0.05830786 0.4975666 0.9368254
## NA 0.6073813 0.27793723 0.2779372 0.6073813
##
## $branch.y
## [,1] [,2] [,3] [,4] [,5]
## y 1.02051 0.6121098 0.1220290 0.1220290 0.1220290
## NA 0.8335647 0.4251641 0.4251641 0.8335647
## NA 0.8335647 0.4251641 0.4251641 0.8335647
##
## $labs
## [1] "Normal\n1 / 29777" "Normal\n1 / 37" "Anomaly\n0 / 1"
## [4] "Normal\n0 / 36" "Normal\n0 / 29740"
##
## $cex
## [1] 1
##
## $boxes
## $boxes$x1
## [1] 0.51073356 0.20097700 -0.03058419 0.42060637 0.84017762
##
## $boxes$y1
## [1] 0.88316255 0.47476193 -0.01531881 -0.01531881 -0.01531881
##
## $boxes$x2
## [1] 0.7040290 0.3548975 0.1471999 0.5745268 1.0334731
##
## $boxes$y2
## [1] 1.0205104 0.6121098 0.1220290 0.1220290 0.1220290
##
##
## $split.labs
## [1] ""
##
## $split.cex
## [1] 1 1 1 1 1
##
## $split.box
## $split.box$x1
## [1] 0.3717279 0.1275963 NA NA NA
##
## $split.box$y1
## [1] 0.7954125 0.3870119 NA NA NA
##
## $split.box$x2
## [1] 0.8430347 0.4282782 NA NA NA
##
## $split.box$y2
## [1] 0.8717169 0.4633163 NA NA NA
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.rules with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.

## rule cover
## 4 IF total_rec_late_fee >= 87 & pub_rec >= 0.5 0%